Setup

library(ngsReports)
library(magrittr)
library(scales)
library(pander)
library(tidyverse)
theme_set(theme_bw())
options(scipen = 999)
if (interactive()) setwd(here::here("R"))
deMuxFqc <- list.files("../3_demuxTrimmed/FastQC/", pattern = "zip", full.names = TRUE) %>%
  FastqcDataList()
alnFqc <- list.files("../4_aligned/FastQC/", pattern = "zip", full.names = TRUE) %>%
  FastqcDataList()
oryGC <- read_rds("oryGC.RDS")

Comparison of Reads with Alignments

Alignments were filtered for:

  • Unique alignments. Any with supplementary alignments were removed by filtering on the SA tag added by bwa.
  • Mapping Quality > 30. Given the PHRED-scaled MAPQ score, this equates to an approximate \(p = 0.001\) for an incorrect alignment.
list(
  readTotals(deMuxFqc) %>%
    mutate(Sample = str_remove(Filename, ".[12].fq.gz")) %>%
    group_by(Sample) %>%
    summarise(Total_Sequences = sum(Total_Sequences)) %>%
    mutate(Type = "Pre-Alignment"),
  readTotals(alnFqc) %>%
    mutate(Sample = str_remove(Filename, ".bam"),
           Type = "Post-Alignment") %>%
    dplyr::select(Sample, Type, Total_Sequences)
) %>%
  bind_rows() %>%
  mutate(Population = case_when(
    grepl("gc", Sample) ~ "1996",
    grepl("ora", Sample) ~ "2012",
    !grepl("(gc|ora)", Sample) ~ "2010"
  )) %>%
  # filter(Population != "2010") %>%
  ggplot(aes(Sample, Total_Sequences, fill = Type)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  facet_wrap(~Population, scales = "free") + 
  scale_y_continuous(labels = comma) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
*Comparison of library sizes before and after alignment. Given the aggressive filtering of alignments, this shows an acceptable rate of alignment across all samples*

Comparison of library sizes before and after alignment. Given the aggressive filtering of alignments, this shows an acceptable rate of alignment across all samples

list(
  readTotals(deMuxFqc) %>%
    mutate(Sample = str_remove(Filename, ".[12].fq.gz")) %>%
    group_by(Sample) %>%
    summarise(Total_Sequences = sum(Total_Sequences)) %>%
    mutate(Type = "Pre-Alignment"),
  readTotals(alnFqc) %>%
    mutate(
      Sample = str_remove(Filename, ".bam"),
      Type = "Post-Alignment"
    ) %>%
    dplyr::select(Sample, Type, Total_Sequences)
) %>%
  bind_rows() %>%
  mutate(
    Population = case_when(
      grepl("gc", Sample) ~ "1996",
      grepl("ora", Sample) ~ "2012",
      !grepl("(gc|ora)", Sample) ~ "2010"
    )
  ) %>%
  # filter(Population != "2010") %>%
  spread(key = Type, value = Total_Sequences) %>%
  mutate(Rate = `Post-Alignment`/`Pre-Alignment`) %>%
  ggplot(aes(Sample, Rate, fill = Population)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  facet_wrap(~Population, scales = "free_y") +
  scale_y_continuous(label = percent, expand = expand_scale(c(0, 0.05))) +
  labs(y = "Alignment Rate")
*Alignment rate for each sample after filtering for uniquely mapped reads with high mapping qualities. The two samples gc2700 and gc2709 appear to be outliers.*

Alignment rate for each sample after filtering for uniquely mapped reads with high mapping qualities. The two samples gc2700 and gc2709 appear to be outliers.

readTotals(alnFqc) %>%
  mutate(
    Sample = str_remove(Filename, ".bam"),
    Population = case_when(
      grepl("gc", Sample) ~ "1996",
      grepl("ora", Sample) ~ "2012",
      !grepl("(gc|ora)", Sample) ~ "2010"
    )
  ) %>%
  dplyr::select(Sample, Population, Total_Sequences) %>%
  group_by(Population) %>%
  summarise(
    Samples = n(),
    `Smallest Library` = min(Total_Sequences),
    `Median Library` = median(Total_Sequences),
    `Largest Library` = max(Total_Sequences),
    `Total Alignments` = sum(Total_Sequences)
  ) %>%
  pander(
    big.mark = ",",
    split.tables = Inf,
    style = "rmarkdown",
    justify = "rrrrrr",
    caption = "*Summary of Library Sizes After Alignment*"
  )
Summary of Library Sizes After Alignment
Population Samples Smallest Library Median Library Largest Library Total Alignments
1996 59 1,541,386 5,253,757 16,146,550 351,966,386
2010 37 1,264,888 4,141,755 7,670,908 163,491,029
2012 53 3,411,367 6,740,247 14,398,379 377,550,470

Identification of Low Quality Samples by GC content

plotGcContent(alnFqc, usePlotly = TRUE, dendrogram = TRUE, theoreticalGC = TRUE, GCobject = oryGC, species = "Ocuniculus")

Difference in GC content of alignments when comparing each sample to Theoretical GC content from sampling the O. cuniculus genome.

alnFqc %>%
    plotGcContent(plotType = "line",usePlotly = TRUE, GCobject = oryGC, species = "Ocuniculus")

GC content of the complete set of samples. The three outlier samples are clearly evident.

lowQ <- paste(c("pt1125", "gc2709", "gc2700"), "bam", sep = ".")

Potential low quality samples were identified by GC content as pt1125.bam, gc2709.bam and gc2700.bam. Alignments from these samples should be moved and placed into a separate folder to ensure their exclusion from the stacks pipeline. The sample gc2776 was also of some concern, but was retained for downstream analysis.

Sequence Content

Inspection of the per base sequence content confirmed the divergent patterns of these samples.

o <- order(fqName(alnFqc))
plotSeqContent(alnFqc[o])
*Per base sequence content of all samples, with divergent patterns clearly identifying the outlier samples*

Per base sequence content of all samples, with divergent patterns clearly identifying the outlier samples

Conclusion

All identified low quality samples were manually moved to a sub-folder indicating their low quality and excluded from the stacks pipeline.

Final Summary

alnFqcPass <- alnFqc[!fqName(alnFqc) %in% lowQ]
list(
  readTotals(deMuxFqc) %>%
    mutate(Sample = str_remove(Filename, ".[12].fq.gz")) %>%
    group_by(Sample) %>%
    summarise(Total_Sequences = sum(Total_Sequences)) %>%
    mutate(Type = "Pre-Alignment"),
  readTotals(alnFqcPass) %>%
    mutate(
      Sample = str_remove(Filename, ".bam"),
      Type = "Post-Alignment") %>%
    dplyr::select(Sample, Type, Total_Sequences)
) %>%
  bind_rows() %>%
  mutate(
    Population = case_when(
      grepl("gc", Sample) ~ "1996",
      grepl("ora", Sample) ~ "2012",
      !grepl("(gc|ora)", Sample) ~ "2010"
    )
  ) %>%
  # filter(Population != "2010") %>%
  spread(key = Type, value = Total_Sequences) %>%
  dplyr::filter(`Post-Alignment` > 0) %>%
  mutate(
    `Alignment Rate` = `Post-Alignment` / `Pre-Alignment`,
  ) %>%
  group_by(Population) %>%
  summarise(
    `Pre-Alignment` = sum(`Pre-Alignment`),
    `Post-Alignment` = sum(`Post-Alignment`),
    minRate =  min(`Alignment Rate`),
    maxRate = max(`Alignment Rate`)
  ) %>%
  bind_rows(
    tibble(
      Population = "Total",
      `Post-Alignment` = sum(.$`Post-Alignment`),
      `Pre-Alignment` = sum(.$`Pre-Alignment`)
    )
  ) %>%
  mutate(
    `Alignment Rate` = `Post-Alignment` / `Pre-Alignment`,
  ) %>%
  mutate_at(
    vars(contains("Rate")), ~percent(., accuracy = 0.01)
  ) %>%
  dplyr::select(
    Population, `Pre-Alignment`, contains("Align"), everything()
  ) %>%
  dplyr::rename(Worst = minRate, Best = maxRate) %>%
  pander(
    big.mark = ",",
    style = "rmarkdown",
    justify = "lrrrrr",
    missing = "",
    emphasize.strong.rows = nrow(.),
    split.table = Inf,
    caption = "*Summary of alignment rates overall and by population, after exclusion of low quality samples.*"
  )
Summary of alignment rates overall and by population, after exclusion of low quality samples.
Population Pre-Alignment Post-Alignment Alignment Rate Worst Best
1996 420,870,972 345,681,486 82.13% 65.73% 85.97%
2010 191,105,136 161,676,155 84.60% 80.43% 86.21%
2012 455,559,218 377,550,470 82.88% 74.09% 86.30%
Total 1,067,535,326 884,908,111 82.89%

Session Info

R version 3.6.2 (2019-12-12)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: forcats(v.0.4.0), stringr(v.1.4.0), dplyr(v.0.8.3), purrr(v.0.3.3), readr(v.1.3.1), tidyr(v.1.0.0), tidyverse(v.1.3.0), pander(v.0.6.3), scales(v.1.1.0), magrittr(v.1.5), ngsReports(v.1.1.2), tibble(v.2.1.3), ggplot2(v.3.2.1) and BiocGenerics(v.0.32.0)

loaded via a namespace (and not attached): colorspace(v.1.4-1), hwriter(v.1.3.2), ellipsis(v.0.3.0), XVector(v.0.26.0), GenomicRanges(v.1.38.0), ggdendro(v.0.1-20), fs(v.1.3.1), rstudioapi(v.0.10), farver(v.2.0.3), ggrepel(v.0.8.1), fansi(v.0.4.1), lubridate(v.1.7.4), xml2(v.1.2.2), leaps(v.3.1), knitr(v.1.27), zeallot(v.0.1.0), jsonlite(v.1.6), Rsamtools(v.2.2.1), Cairo(v.1.5-10), broom(v.0.5.3), cluster(v.2.1.0), dbplyr(v.1.4.2), png(v.0.1-7), shiny(v.1.4.0), compiler(v.3.6.2), httr(v.1.4.1), backports(v.1.1.5), fastmap(v.1.0.1), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), cli(v.2.0.1), later(v.1.0.0), htmltools(v.0.4.0), tools(v.3.6.2), gtable(v.0.3.0), glue(v.1.3.1), GenomeInfoDbData(v.1.2.2), reshape2(v.1.4.3), FactoMineR(v.2.1), ShortRead(v.1.44.1), Rcpp(v.1.0.3), Biobase(v.2.46.0), cellranger(v.1.1.0), vctrs(v.0.2.1), Biostrings(v.2.54.0), nlme(v.3.1-143), crosstalk(v.1.0.0), xfun(v.0.12), rvest(v.0.3.5), mime(v.0.8), lifecycle(v.0.1.0), zlibbioc(v.1.32.0), MASS(v.7.3-51.5), zoo(v.1.8-7), promises(v.1.1.0), hms(v.0.5.3), SummarizedExperiment(v.1.16.1), RColorBrewer(v.1.1-2), yaml(v.2.2.0), latticeExtra(v.0.6-29), stringi(v.1.4.5), highr(v.0.8), S4Vectors(v.0.24.3), BiocParallel(v.1.20.1), truncnorm(v.1.0-8), GenomeInfoDb(v.1.22.0), rlang(v.0.4.2), pkgconfig(v.2.0.3), matrixStats(v.0.55.0), bitops(v.1.0-6), evaluate(v.0.14), lattice(v.0.20-38), GenomicAlignments(v.1.22.1), htmlwidgets(v.1.5.1), labeling(v.0.3), tidyselect(v.0.2.5), plyr(v.1.8.5), R6(v.2.4.1), IRanges(v.2.20.2), generics(v.0.0.2), DelayedArray(v.0.12.2), DBI(v.1.1.0), pillar(v.1.4.3), haven(v.2.2.0), withr(v.2.1.2), scatterplot3d(v.0.3-41), RCurl(v.1.98-1.1), modelr(v.0.1.5), crayon(v.1.3.4), plotly(v.4.9.1), rmarkdown(v.2.1), jpeg(v.0.1-8.1), grid(v.3.6.2), readxl(v.1.3.1), data.table(v.1.12.8), reprex(v.0.3.0), digest(v.0.6.23), flashClust(v.1.01-2), webshot(v.0.5.2), xtable(v.1.8-4), httpuv(v.1.5.2), stats4(v.3.6.2), munsell(v.0.5.0), viridisLite(v.0.3.0) and kableExtra(v.1.1.0)